1 and 2. Look at the chapter on interactive graphics and, specifically, the code to display a subject's MRICloud data as a sunburst plot. Do the following. Display this subject's data as a Sankey diagram. Display as many levels as you can for type = 1, starting from the intracranial volume. Put this in a file called hw4.ipynb.
import pandas as pd
import plotly.express as px
import numpy as np
import os
import plotly.graph_objects as go
## load in the hierarchy information
url = "https://raw.githubusercontent.com/bcaffo/MRIcloudT1volumetrics/master/inst/extdata/multilevel_lookup_table.txt"
multilevel_lookup = pd.read_csv(url, sep = "\t").drop(['Level5'], axis = 1)
multilevel_lookup = multilevel_lookup.rename(columns = {
"modify" : "roi",
"modify.1" : "level4",
"modify.2" : "level3",
"modify.3" : "level2",
"modify.4" : "level1"})
multilevel_lookup = multilevel_lookup[['roi', 'level4', 'level3', 'level2', 'level1']]
multilevel_lookup.head()
| roi | level4 | level3 | level2 | level1 | |
|---|---|---|---|---|---|
| 0 | SFG_L | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L |
| 1 | SFG_R | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R |
| 2 | SFG_PFC_L | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L |
| 3 | SFG_PFC_R | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R |
| 4 | SFG_pole_L | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L |
## Now load in the subject data
id = 127
subjectData = pd.read_csv("kirby21AllLevels.csv")
subjectData = subjectData.loc[(subjectData.type == 1) & (subjectData.level == 5) & (subjectData.id == id)]
# Show only this subject's region (ROI) and volume
subjectData = subjectData[['roi', 'volume']]
## Merge the subject data with the multilevel data
subjectData = pd.merge(subjectData, multilevel_lookup, on = "roi")
subjectData = subjectData.assign(icv = "ICV")
subjectData = subjectData.assign(comp = subjectData.volume / np.sum(subjectData.volume))
subjectData.head()
| roi | volume | level4 | level3 | level2 | level1 | icv | comp | |
|---|---|---|---|---|---|---|---|---|
| 0 | SFG_L | 12926 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009350 |
| 1 | SFG_R | 10050 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.007270 |
| 2 | SFG_PFC_L | 12783 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.009247 |
| 3 | SFG_PFC_R | 11507 | SFG_R | Frontal_R | CerebralCortex_R | Telencephalon_R | ICV | 0.008324 |
| 4 | SFG_pole_L | 3078 | SFG_L | Frontal_L | CerebralCortex_L | Telencephalon_L | ICV | 0.002227 |
subjectData = subjectData.drop(['volume'], axis=1)
df2=subjectData.groupby(['icv', 'level1'], as_index=False)['comp'].count()
df2.head()
df3=subjectData.groupby(['level1', 'level2'], as_index=False)['comp'].count()
df3.head()
df4=subjectData.groupby(['level2','level3'], as_index=False)['comp'].count()
df4.head()
df5=subjectData.groupby(['level3','level4'], as_index=False)['comp'].count()
df5.head()
df6=subjectData.groupby(['level4','roi'], as_index=False)['comp'].count()
df6.head()
| level4 | roi | comp | |
|---|---|---|---|
| 0 | AG_L | AG_L | 1 |
| 1 | AG_R | AG_R | 1 |
| 2 | ALIC_L | ALIC_L | 1 |
| 3 | ALIC_R | ALIC_R | 1 |
| 4 | Amyg_L | Amyg_L | 1 |
df2.columns = ['a', 'b', 'comp']
df3.columns = ['a', 'b', 'comp']
df4.columns = ['a', 'b', 'comp']
df5.columns = ['a', 'b', 'comp']
df6.columns = ['a', 'b', 'comp']
df7 = df2.append(df3)
df8 = df7.append(df4)
df9 = df8.append(df5)
df10 = df9.append(df6)
df10
| a | b | comp | |
|---|---|---|---|
| 0 | ICV | CSF | 32 |
| 1 | ICV | Diencephalon_L | 8 |
| 2 | ICV | Diencephalon_R | 8 |
| 3 | ICV | Mesencephalon | 8 |
| 4 | ICV | Metencephalon | 20 |
| ... | ... | ... | ... |
| 269 | midbrain_R | Midbrain_R | 1 |
| 270 | midbrain_R | RedNc_R | 1 |
| 271 | midbrain_R | Snigra_R | 1 |
| 272 | post_DPWM_L | PCR_L | 1 |
| 273 | post_DPWM_R | PCR_R | 1 |
500 rows × 3 columns
import chart_studio.plotly as py
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
# maximum of 6 value cols -> 6 colors
colorPalette = ['#FFD43B','#646464','#4B8BBE','#306998']
labelList = []
colorNumList = []
for catCol in cat_cols:
labelListTemp = list(set(df[catCol].values))
colorNumList.append(len(labelListTemp))
labelList = labelList + labelListTemp
# remove duplicates from labelList
labelList = list(dict.fromkeys(labelList))
# define colors based on number of levels
colorList = []
for idx, colorNum in enumerate(colorNumList):
colorList = colorList + [colorPalette[idx]]*colorNum
# transform df into a source-target pair
for i in range(len(cat_cols)-1):
if i==0:
sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
sourceTargetDf.columns = ['source','target','count']
else:
tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
tempDf.columns = ['source','target','count']
sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()
# add index for source-target pair
sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))
# creating the sankey diagram
data = dict(
type='sankey',
node = dict(
pad = 15,
thickness = 20,
line = dict(
color = "black",
width = 0.5
),
label = labelList,
color = colorList
),
link = dict(
source = sourceTargetDf['sourceID'],
target = sourceTargetDf['targetID'],
value = sourceTargetDf['count']
)
)
layout = dict(
title = title,
font = dict(
size = 10
)
)
fig = dict(data=[data], layout=layout)
return fig
import plotly
fig = go.Figure(genSankey(df10,
cat_cols=['a','b'],
value_cols='comp',
title='Sankey Diagram of Brain Regions'),
)
fig.update_layout(
autosize=False,
width=1000,
height=2000,
paper_bgcolor="LightSteelBlue",
)
#plotly.offline.plot(fig, validate=False)
fig.show()
#fig.to_html()
annual = pd.read_csv("county_annual.csv")
population = pd.read_csv("county_pop_arcos.csv")
land = pd.read_csv("land_area.csv")
annual.describe()
| Unnamed: 0 | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|
| count | 27758.000000 | 27758.000000 | 27758.000000 | 2.775800e+04 | 26998.000000 |
| mean | 13879.500000 | 2009.998559 | 8331.492723 | 3.539313e+06 | 30267.350507 |
| std | 8013.188722 | 2.583978 | 19519.183369 | 9.069074e+06 | 15124.119078 |
| min | 1.000000 | 2006.000000 | 1.000000 | 6.000000e+00 | 1001.000000 |
| 25% | 6940.250000 | 2008.000000 | 930.000000 | 3.302225e+05 | 18169.000000 |
| 50% | 13879.500000 | 2010.000000 | 2804.000000 | 1.062635e+06 | 29115.000000 |
| 75% | 20818.750000 | 2012.000000 | 7391.750000 | 3.061181e+06 | 45039.000000 |
| max | 27758.000000 | 2014.000000 | 465799.000000 | 2.106367e+08 | 56045.000000 |
population.describe()
| Unnamed: 0 | countyfips | STATE | COUNTY | year | population | |
|---|---|---|---|---|---|---|
| count | 28265.000000 | 28265.000000 | 28265.000000 | 28265.000000 | 28265.000000 | 2.826500e+04 |
| mean | 14133.000000 | 30405.603856 | 30.301857 | 103.747285 | 2009.999222 | 9.730740e+04 |
| std | 8159.547016 | 15155.969500 | 15.137698 | 107.945637 | 2.582094 | 3.109070e+05 |
| min | 1.000000 | 1001.000000 | 1.000000 | 1.000000 | 2006.000000 | 4.100000e+01 |
| 25% | 7067.000000 | 18181.000000 | 18.000000 | 35.000000 | 2008.000000 | 1.108700e+04 |
| 50% | 14133.000000 | 29179.000000 | 29.000000 | 79.000000 | 2010.000000 | 2.570000e+04 |
| 75% | 21199.000000 | 45083.000000 | 45.000000 | 133.000000 | 2012.000000 | 6.632300e+04 |
| max | 28265.000000 | 56045.000000 | 56.000000 | 840.000000 | 2014.000000 | 9.974203e+06 |
land.describe()
| Unnamed: 0 | STCOU | LND010190F | LND010190D | LND010190N1 | LND010190N2 | LND010200F | LND010200D | LND010200N1 | LND010200N2 | ... | LND110210N1 | LND110210N2 | LND210190F | LND210190D | LND210190N1 | LND210190N2 | LND210200F | LND210200D | LND210200N1 | LND210200N2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 3198.000000 | 3198.000000 | 3198.000000 | 3.198000e+03 | 3198.000000 | 3198.0 | 3198.000000 | 3.198000e+03 | 3198.000000 | 3198.0 | ... | 3198.0 | 3198.0 | 3198.000000 | 3198.000000 | 3198.000000 | 3198.0 | 3198.000000 | 3198.000000 | 3198.000000 | 3198.0 |
| mean | 1599.500000 | 30371.331144 | 0.005003 | 3.552932e+03 | 11.130707 | 0.0 | 0.006567 | 3.559177e+03 | 6.362727 | 0.0 | ... | 0.0 | 0.0 | 0.005003 | 235.537852 | 11.130707 | 0.0 | 0.006567 | 240.754841 | 6.362727 | 0.0 |
| std | 923.327407 | 15183.455899 | 0.117209 | 6.872234e+04 | 237.686829 | 0.0 | 0.162924 | 6.884981e+04 | 179.824440 | 0.0 | ... | 0.0 | 0.0 | 0.117209 | 4785.627574 | 237.686829 | 0.0 | 0.162924 | 4908.297150 | 179.824440 | 0.0 |
| min | 1.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.0 | 0.000000 | 0.000000e+00 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.0 |
| 25% | 800.250000 | 18173.500000 | 0.000000 | 4.468625e+02 | 0.000000 | 0.0 | 0.000000 | 4.475425e+02 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 1.940000 | 0.000000 | 0.0 | 0.000000 | 1.960000 | 0.000000 | 0.0 |
| 50% | 1599.500000 | 29178.000000 | 0.000000 | 6.564800e+02 | 0.000000 | 0.0 | 0.000000 | 6.570400e+02 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 6.520000 | 0.000000 | 0.0 | 0.000000 | 6.575000 | 0.000000 | 0.0 |
| 75% | 2398.750000 | 45078.500000 | 0.000000 | 1.014990e+03 | 0.000000 | 0.0 | 0.000000 | 1.016825e+03 | 0.000000 | 0.0 | ... | 0.0 | 0.0 | 0.000000 | 23.090000 | 0.000000 | 0.0 | 0.000000 | 23.327500 | 0.000000 | 0.0 |
| max | 3198.000000 | 56045.000000 | 5.000000 | 3.787425e+06 | 5090.000000 | 0.0 | 5.000000 | 3.794083e+06 | 5090.000000 | 0.0 | ... | 0.0 | 0.0 | 5.000000 | 251083.350000 | 5090.000000 | 0.0 | 5.000000 | 256644.620000 | 5090.000000 | 0.0 |
8 rows × 34 columns
population = population.drop(columns="Unnamed: 0", axis=1)
| BUYER_COUNTY | BUYER_STATE | countyfips | STATE | COUNTY | county_name | NAME | variable | year | population | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2006 | 51328 |
| 1 | BALDWIN | AL | 1003 | 1 | 3 | Baldwin | Baldwin County, Alabama | B01003_001 | 2006 | 168121 |
| 2 | BARBOUR | AL | 1005 | 1 | 5 | Barbour | Barbour County, Alabama | B01003_001 | 2006 | 27861 |
| 3 | BIBB | AL | 1007 | 1 | 7 | Bibb | Bibb County, Alabama | B01003_001 | 2006 | 22099 |
| 4 | BLOUNT | AL | 1009 | 1 | 9 | Blount | Blount County, Alabama | B01003_001 | 2006 | 55485 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28260 | WASHAKIE | WY | 56043 | 56 | 43 | Washakie | Washakie County, Wyoming | B01003_001 | 2014 | 8444 |
| 28261 | WESTON | WY | 56045 | 56 | 45 | Weston | Weston County, Wyoming | B01003_001 | 2014 | 7135 |
| 28262 | SKAGWAY | AK | 2230 | 2 | 230 | Skagway | Skagway Municipality, Alaska | B01003_001 | 2014 | 996 |
| 28263 | HOONAH ANGOON | AK | 2105 | 2 | 105 | Hoonah Angoon | Hoonah-Angoon Census Area, Alaska | B01003_001 | 2014 | 2126 |
| 28264 | PETERSBURG | AK | 2195 | 2 | 195 | Petersburg | Petersburg Borough, Alaska | B01003_001 | 2014 | 3212 |
28265 rows × 10 columns
annual = annual.drop(columns="Unnamed: 0", axis=1)
land = land.drop(columns="Unnamed: 0", axis=1)
population.head()
| BUYER_COUNTY | BUYER_STATE | countyfips | STATE | COUNTY | county_name | NAME | variable | year | population | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2006 | 51328 |
| 1 | BALDWIN | AL | 1003 | 1 | 3 | Baldwin | Baldwin County, Alabama | B01003_001 | 2006 | 168121 |
| 2 | BARBOUR | AL | 1005 | 1 | 5 | Barbour | Barbour County, Alabama | B01003_001 | 2006 | 27861 |
| 3 | BIBB | AL | 1007 | 1 | 7 | Bibb | Bibb County, Alabama | B01003_001 | 2006 | 22099 |
| 4 | BLOUNT | AL | 1009 | 1 | 9 | Blount | Blount County, Alabama | B01003_001 | 2006 | 55485 |
print(annual[annual['countyfips'].isnull()])
BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips 187 ADJUNTAS PR 2006 147 102800.0 NaN 188 ADJUNTAS PR 2007 153 104800.0 NaN 189 ADJUNTAS PR 2008 153 45400.0 NaN 190 ADJUNTAS PR 2009 184 54200.0 NaN 191 ADJUNTAS PR 2010 190 56200.0 NaN ... ... ... ... ... ... ... 27753 NaN NV 2007 447 200600.0 NaN 27754 NaN NV 2008 5 2200.0 NaN 27755 NaN OH 2006 23 5100.0 NaN 27756 NaN PR 2006 10 17800.0 NaN 27757 NaN PR 2007 2 1300.0 NaN [760 rows x 6 columns]
print(annual[(annual['countyfips'].isnull()) & (annual["BUYER_STATE"] != "PR")])
BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips 10071 GUAM GU 2006 319 265348.0 NaN 10072 GUAM GU 2007 330 275600.0 NaN 10073 GUAM GU 2008 313 286900.0 NaN 10074 GUAM GU 2009 390 355300.0 NaN 10075 GUAM GU 2010 510 413800.0 NaN ... ... ... ... ... ... ... 27751 NaN MA 2006 247 114900.0 NaN 27752 NaN NV 2006 380 173600.0 NaN 27753 NaN NV 2007 447 200600.0 NaN 27754 NaN NV 2008 5 2200.0 NaN 27755 NaN OH 2006 23 5100.0 NaN [74 rows x 6 columns]
annual["countyfips"] = annual["countyfips"].astype(str)
annual.loc[(annual["BUYER_STATE"] == "AR") & (annual["BUYER_COUNTY"] == "MONTGOMERY"), "countyfips"] = "05097"
print(annual.loc[(annual["BUYER_STATE"] == "AR") & (annual["BUYER_COUNTY"] == "MONTGOMERY")])
BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips 17429 MONTGOMERY AR 2006 469 175390.0 05097 17430 MONTGOMERY AR 2007 597 241270.0 05097 17431 MONTGOMERY AR 2008 561 251760.0 05097 17432 MONTGOMERY AR 2009 554 244160.0 05097 17433 MONTGOMERY AR 2010 449 247990.0 05097 17434 MONTGOMERY AR 2011 560 313800.0 05097 17435 MONTGOMERY AR 2012 696 339520.0 05097 17436 MONTGOMERY AR 2013 703 382300.0 05097 17437 MONTGOMERY AR 2014 491 396900.0 05097
#sqlite> delete from annual where BUYER_COUNTY = "NA"
print(annual[annual['BUYER_COUNTY'].isnull()])
BUYER_COUNTY BUYER_STATE year count DOSAGE_UNIT countyfips 27741 NaN AE 2006 2 330.0 nan 27742 NaN CA 2006 47 12600.0 nan 27743 NaN CT 2006 305 78700.0 nan 27744 NaN CT 2007 112 30900.0 nan 27745 NaN CT 2008 48 15000.0 nan 27746 NaN FL 2006 9 900.0 nan 27747 NaN FL 2007 7 700.0 nan 27748 NaN GA 2006 114 51700.0 nan 27749 NaN IA 2006 7 2300.0 nan 27750 NaN IN 2006 292 39300.0 nan 27751 NaN MA 2006 247 114900.0 nan 27752 NaN NV 2006 380 173600.0 nan 27753 NaN NV 2007 447 200600.0 nan 27754 NaN NV 2008 5 2200.0 nan 27755 NaN OH 2006 23 5100.0 nan 27756 NaN PR 2006 10 17800.0 nan 27757 NaN PR 2007 2 1300.0 nan
annual = annual[annual['BUYER_COUNTY'].notnull()]
print(annual[annual['BUYER_COUNTY'].isnull()])
Empty DataFrame Columns: [BUYER_COUNTY, BUYER_STATE, year, count, DOSAGE_UNIT, countyfips] Index: []
# sqlite> create table land_area as select Areaname, STCOU, LND110210D from land;
land_area = land[["Areaname", "STCOU", "LND110210D"]]
# sqlite> alter table land_area rename column STCOU to countyfips;
land_area = land_area.rename(columns={"STCOU": "countyfips"})
land_area
| Areaname | countyfips | LND110210D | |
|---|---|---|---|
| 0 | UNITED STATES | 0 | 3531905.43 |
| 1 | ALABAMA | 1000 | 50645.33 |
| 2 | Autauga, AL | 1001 | 594.44 |
| 3 | Baldwin, AL | 1003 | 1589.78 |
| 4 | Barbour, AL | 1005 | 884.88 |
| ... | ... | ... | ... |
| 3193 | Sweetwater, WY | 56037 | 10426.65 |
| 3194 | Teton, WY | 56039 | 3995.38 |
| 3195 | Uinta, WY | 56041 | 2081.26 |
| 3196 | Washakie, WY | 56043 | 2238.55 |
| 3197 | Weston, WY | 56045 | 2398.09 |
3198 rows × 3 columns
#sqlite> create table county_info as select * from population left join land_area using(countyfips);
county_info = pd.merge(population, land_area, left_on = "countyfips", right_on = "countyfips")
county_info.head()
| BUYER_COUNTY | BUYER_STATE | countyfips | STATE | COUNTY | county_name | NAME | variable | year | population | Areaname | LND110210D | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2006 | 51328 | Autauga, AL | 594.44 |
| 1 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2007 | 52405 | Autauga, AL | 594.44 |
| 2 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2008 | 53277 | Autauga, AL | 594.44 |
| 3 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2009 | 49584 | Autauga, AL | 594.44 |
| 4 | AUTAUGA | AL | 1001 | 1 | 1 | Autauga | Autauga County, Alabama | B01003_001 | 2010 | 53155 | Autauga, AL | 594.44 |
len(land)
3198
len(land_area)
3198
len(county_info)
28256
len(population)
28265
annual.head()
| BUYER_COUNTY | BUYER_STATE | year | count | DOSAGE_UNIT | countyfips | |
|---|---|---|---|---|---|---|
| 0 | ABBEVILLE | SC | 2006 | 877 | 363620.0 | 45001.0 |
| 1 | ABBEVILLE | SC | 2007 | 908 | 402940.0 | 45001.0 |
| 2 | ABBEVILLE | SC | 2008 | 871 | 424590.0 | 45001.0 |
| 3 | ABBEVILLE | SC | 2009 | 930 | 467230.0 | 45001.0 |
| 4 | ABBEVILLE | SC | 2010 | 1197 | 539280.0 | 45001.0 |
annual_group = annual.groupby(by=['year'])['DOSAGE_UNIT'].mean().reset_index()
annual_group
| year | DOSAGE_UNIT | |
|---|---|---|
| 0 | 2006 | 2.654669e+06 |
| 1 | 2007 | 2.995906e+06 |
| 2 | 2008 | 3.254271e+06 |
| 3 | 2009 | 3.526038e+06 |
| 4 | 2010 | 3.783656e+06 |
| 5 | 2011 | 4.035583e+06 |
| 6 | 2012 | 3.993064e+06 |
| 7 | 2013 | 3.861752e+06 |
| 8 | 2014 | 3.768738e+06 |
fig_opioids = px.scatter(annual_group, x="year", y="DOSAGE_UNIT", title="Average Opioid Pill Shipments by Year")
fig_opioids.show()
fig_opioids_2 = px.line(annual_group, x="year", y="DOSAGE_UNIT", title="Average Opioid Pill Shipments by Year", markers=True)
fig_opioids_2.show()